{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import zipfile\n",
    "\n",
    "import numpy as np\n",
    "from gensim.models import word2vec\n",
    "from gensim.models.doc2vec import Word2Vec\n",
    "from keras.layers import Activation, Embedding, Merge, Reshape\n",
    "from keras.models import Sequential\n",
    "from keras.preprocessing.sequence import skipgrams, make_sampling_table\n",
    "from keras.preprocessing.text import Tokenizer, base_filter\n",
    "from keras.utils.data_utils import get_file"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setting Hyperparameters\n",
    "You set hyperparameters for Skip-gram with negative sampling.\n",
    "\n",
    "By default, it is set as follows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Hyper Parameter Settings\n",
    "embedding_size = 200\n",
    "epochs_to_train = 10\n",
    "num_neg_samples = 5\n",
    "sampling_factor = 1e-5\n",
    "window_size = 5\n",
    "save_path = './word_vectors.txt'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download training and evaluation data\n",
    "You can download training data and evaluation data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading data from http://mattmahoney.net/dc/text8.zip\n",
      "31309824/31344016 [============================>.] - ETA: 0sDownloading data from http://download.tensorflow.org/data/questions-words.txt\n",
      "  8192/603955 [..............................] - ETA: 0s"
     ]
    }
   ],
   "source": [
    "def maybe_download(url):\n",
    "    \"\"\"\n",
    "    Download a file if not present.\n",
    "    \"\"\"\n",
    "    filename = url.split('/')[-1]\n",
    "    path = get_file(filename, url)\n",
    "    return path\n",
    "    \n",
    "\n",
    "def unzip(zip_filename):\n",
    "    \"\"\"\n",
    "    Extract a file from the zipfile\n",
    "    \"\"\"\n",
    "    with zipfile.ZipFile(zip_filename) as f:\n",
    "        for filename in f.namelist():\n",
    "            dirname = os.path.dirname(filename)\n",
    "            f.extract(filename, dirname)\n",
    "            return os.path.abspath(filename)\n",
    "            \n",
    "\n",
    "# Download Data\n",
    "url = 'http://mattmahoney.net/dc/text8.zip'\n",
    "filename = maybe_download(url)\n",
    "text_file = unzip(filename)\n",
    "url = 'http://download.tensorflow.org/data/questions-words.txt'\n",
    "eval_data = maybe_download(url)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reading training data\n",
    "You can read training data from a text file using the word2vec.Text8Corpus class. \n",
    "\n",
    "By default, it assumes that the text file is given.\n",
    "\n",
    "Tokenizer tokenizes the sentences and assign ID to the vocabulary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocabulary: 253855\n"
     ]
    }
   ],
   "source": [
    "# Load Data\n",
    "sentences = word2vec.Text8Corpus(text_file)\n",
    "sentences = [' '.join(sent) for sent in sentences]\n",
    "tokenizer = Tokenizer(filters=base_filter() + \"'\")\n",
    "tokenizer.fit_on_texts(sentences)\n",
    "sentences = tokenizer.texts_to_sequences(sentences)\n",
    "V = len(tokenizer.word_index) + 1\n",
    "print('Vocabulary:', V)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Layers and Activation Functions\n",
    "The two main components to build neural networks architecture in Keras is Layer and Activation. There are many useful layers and activation functions in Keras.  \n",
    "\n",
    "For now, let's create a binary classifier with Embedding layer:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def build_model():\n",
    "    target_word = Sequential()\n",
    "    target_word.add(Embedding(V, embedding_size, input_length=1))\n",
    "\n",
    "    context = Sequential()\n",
    "    context.add(Embedding(V, embedding_size, input_length=1))\n",
    "\n",
    "    model = Sequential()\n",
    "    model.add(Merge([target_word, context], mode='dot', dot_axes=2))\n",
    "    model.add(Reshape((1,), input_shape=(1, 1)))\n",
    "    model.add(Activation('sigmoid'))\n",
    "    model.compile(loss='binary_crossentropy', optimizer='rmsprop')\n",
    "\n",
    "    return model\n",
    "\n",
    "model = build_model()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training Model\n",
    "Now, we obtained skip-gram model. Let's train it by calling train_on_batch and passing training examples:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0/1701\n",
      "500/1701\n",
      "1000/1701\n",
      "1500/1701\n",
      "num epoch: 0 loss: 529.3742761611938\n",
      "0/1701\n",
      "500/1701\n",
      "1000/1701\n",
      "1500/1701\n",
      "num epoch: 1 loss: 290.1124480217695\n",
      "0/1701\n",
      "500/1701\n",
      "1000/1701\n",
      "1500/1701\n",
      "num epoch: 2 loss: 260.58118246495724\n",
      "0/1701\n",
      "500/1701\n",
      "1000/1701\n",
      "1500/1701\n",
      "num epoch: 3 loss: 245.68536188453436\n",
      "0/1701\n",
      "500/1701\n",
      "1000/1701\n",
      "1500/1701\n"
     ]
    }
   ],
   "source": [
    "def train_model(model):\n",
    "    sampling_table = make_sampling_table(V, sampling_factor=sampling_factor)\n",
    "    for epoch in range(epochs_to_train):\n",
    "        loss = 0.\n",
    "        for i, sent in enumerate(sentences):\n",
    "            if i % 500 == 0:\n",
    "                print('{}/{}'.format(i, len(sentences)))\n",
    "            couples, labels = skipgrams(sequence=sent, vocabulary_size=V, window_size=window_size,\n",
    "                                        negative_samples=num_neg_samples, sampling_table=sampling_table)\n",
    "            if couples:\n",
    "                words, contexts = zip(*couples)\n",
    "                words = np.array(words, dtype=np.int32)\n",
    "                contexts = np.array(contexts, dtype=np.int32)\n",
    "                y = np.array(labels, dtype=np.int32)\n",
    "                loss += model.train_on_batch([words, contexts], y)\n",
    "        print('num epoch: {} loss: {}'.format(epoch, loss))\n",
    "\n",
    "    return model\n",
    "\n",
    "model = train_model(model)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save word embeddings\n",
    "Congraturations! We finished training. \n",
    "Let's save word embeddings into text file:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def save_model(model):\n",
    "    with open(save_path, 'w') as f:\n",
    "        f.write(' '.join([str(V - 1), str(embedding_size)]))\n",
    "        f.write('\\n')\n",
    "        vectors = model.get_weights()[0]\n",
    "        for word, i in tokenizer.word_index.items():\n",
    "            f.write(word)\n",
    "            f.write(' ')\n",
    "            f.write(' '.join(map(str, list(vectors[i, :]))))\n",
    "            f.write('\\n')\n",
    "            \n",
    "save_model(model)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Definition for reading evaluation data\n",
    "You can read evaluation data from a text file:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def read_analogies(filename, word2id):\n",
    "    \"\"\"\n",
    "    Reads through the analogy question file.\n",
    "\n",
    "    Returns:\n",
    "      questions: a [n, 4] numpy array containing the analogy question's word ids.\n",
    "      questions_skipped: questions skipped due to unknown words.\n",
    "    \"\"\"\n",
    "    questions = []\n",
    "    questions_skipped = 0\n",
    "    with open(filename, 'r') as analogy_f:\n",
    "        for line in analogy_f:\n",
    "            if line.startswith(':'):  # Skip comments.\n",
    "                continue\n",
    "            words = line.strip().lower().split()\n",
    "            ids = [w in word2id for w in words]\n",
    "            if False in ids or len(ids) != 4:\n",
    "                questions_skipped += 1\n",
    "            else:\n",
    "                questions.append(words)\n",
    "    print('Eval analogy file: {}'.format(filename))\n",
    "    print('Questions: {}'.format(len(questions)))\n",
    "    print('Skipped: {}'.format(questions_skipped))\n",
    "    return questions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation\n",
    "We evaluate obtained word embeddings on analogy task. \n",
    "\n",
    "In analogy task, when A, B, C are given, you need to find D such that A is to B what C is to D.(e.g. man is to king what women is to D.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Eval analogy file: /home/ubuntu/.keras/datasets/questions-words.txt\n",
      "Questions: 19106\n",
      "Skipped: 438\n",
      "Eval 2936/19106 accuracy = 15.4%\n"
     ]
    }
   ],
   "source": [
    "def eval_model():\n",
    "    w2v = Word2Vec.load_word2vec_format(save_path, binary=False)\n",
    "    w2v.most_similar(positive=['country'])\n",
    "    word2id = dict([(w, i) for i, w in enumerate(w2v.index2word)])\n",
    "    analogy_questions = read_analogies(eval_data, word2id)\n",
    "    correct = 0\n",
    "    total = len(analogy_questions)\n",
    "    for question in analogy_questions:\n",
    "        a, b, c, d = question  # E.g. [Athens, Greece, Baghdad, Iraq]\n",
    "        analogies = w2v.most_similar(positive=[b, c], negative=[a], topn=4)\n",
    "        for analogy in analogies:\n",
    "            word, _ = analogy\n",
    "            if d == word:\n",
    "                # Predicted Correctly!\n",
    "                correct += 1\n",
    "                break\n",
    "    print('Eval %4d/%d accuracy = %4.1f%%' % (correct, total, correct * 100.0 / total))\n",
    "    \n",
    "eval_model()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
